import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.impute import SimpleImputer
import statsmodels.api as sm
df_cars24 = pd.read_csv(r'D:\innomatics assignments\eda project webscrapping\cars24latest.csv')
df_cars24.head()
| Brand | Model | Year | Fuel Type | Distance Driven(KM) | Transmission | Price in Lakhs | EMI/month | City | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Tata | PUNCH | 2021 | Petrol | 28,773 | Automatic | 7.08 | 13,842 | Hyderabad |
| 1 | Tata | Tiago | 2017 | Petrol | 72,629 | Manual | 4.32 | 8,446 | Hyderabad |
| 2 | Tata | TIGOR | 2021 | Petrol | 47,307 | Manual | 5.86 | 11,456 | Hyderabad |
| 3 | Tata | ALTROZ | 2021 | Petrol | 42,996 | Manual | 7.53 | 14,721 | Hyderabad |
| 4 | Tata | Tiago | 2017 | Petrol | 62,204 | Automatic | 4.74 | 9,267 | Hyderabad |
DATA CLEANING¶
#data cleaning - removing commas from values in emi/month column and converting them to int type
df_cars24['EMI/month'] = df_cars24['EMI/month'].str.replace(',', '').astype(float)
#data cleaning - removing commas from values in distance driven column and converting them to int type
df_cars24['Distance Driven(KM)'] = df_cars24['Distance Driven(KM)'].str.replace(',', '').astype(float)
#data overview
df_cars24.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1507 entries, 0 to 1506 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Brand 1507 non-null object 1 Model 1507 non-null object 2 Year 1507 non-null int64 3 Fuel Type 1507 non-null object 4 Distance Driven(KM) 1504 non-null float64 5 Transmission 1460 non-null object 6 Price in Lakhs 1507 non-null float64 7 EMI/month 1507 non-null float64 8 City 1507 non-null object dtypes: float64(3), int64(1), object(5) memory usage: 106.1+ KB
#convert values in model into all lowercase letters
df_cars24['Model'] = df_cars24['Model'].apply(lambda x : x.lower())
imputing missing values¶
#fuilling null values in transmission column
df_cars24["Transmission"].fillna( method ='ffill', limit = 2, inplace = True)
#filling null values in Distance Driven cloumn
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan,strategy='mean')
# select the column to impute
column_to_impute = 'Distance Driven(KM)'
# impute missing values in the selected column
imputed_column = imputer.fit_transform(df_cars24[[column_to_impute]])
# replace the original column with the imputed column
df_cars24[column_to_impute] = imputed_column
df = df_cars24
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1507 entries, 0 to 1506 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Brand 1507 non-null object 1 Model 1507 non-null object 2 Year 1507 non-null int64 3 Fuel Type 1507 non-null object 4 Distance Driven(KM) 1507 non-null float64 5 Transmission 1507 non-null object 6 Price in Lakhs 1507 non-null float64 7 EMI/month 1507 non-null float64 8 City 1507 non-null object dtypes: float64(3), int64(1), object(5) memory usage: 106.1+ KB
#finding duplicate values
duplicateRows = df_cars24[df_cars24.duplicated()]
len(duplicateRows)
0
No duplicate values foung
#description of numerical data
df_cars24.describe()
| Year | Distance Driven(KM) | Price in Lakhs | EMI/month | |
|---|---|---|---|---|
| count | 1507.000000 | 1507.000000 | 1507.000000 | 1507.000000 |
| mean | 2018.087591 | 44831.054521 | 7.115455 | 14918.420703 |
| std | 3.089892 | 27077.632571 | 3.729947 | 8360.063750 |
| min | 2010.000000 | 100.000000 | 1.490000 | 4076.000000 |
| 25% | 2016.000000 | 22439.500000 | 4.525000 | 9716.000000 |
| 50% | 2019.000000 | 42533.000000 | 6.280000 | 13044.000000 |
| 75% | 2021.000000 | 65957.500000 | 8.805000 | 17488.000000 |
| max | 2023.000000 | 99768.000000 | 26.080000 | 93596.000000 |
#descrition of categorical data
df_cars24.describe(include='O')
| Brand | Model | Fuel Type | Transmission | City | |
|---|---|---|---|---|---|
| count | 1507 | 1507 | 1507 | 1507 | 1507 |
| unique | 9 | 123 | 3 | 2 | 10 |
| top | Maruti | kwid | Petrol | Manual | Bangalore |
| freq | 200 | 113 | 1174 | 1104 | 180 |
#model year range
df_cars24['Year'].min(),df_cars24['Year'].max()
(2010, 2023)
Exploring the Data:¶
Vehicle Information Summary: Columns Type : • Numerical — kilometer, price , Emi • Categorical — year, brand,model,city,transmission
# km driven range
df_cars24['Distance Driven(KM)'].min(),df_cars24['Distance Driven(KM)'].max()
(100.0, 99768.0)
#price range
df_cars24['Price in Lakhs'].min(),df_cars24['Price in Lakhs'].max()
(1.49, 26.08)
1.Model year ranges between 2010 to 2023 2.distance driven is ranges between 100 km to 99,786 km 3.price ranges between Rs149000 to Rs26,08,000(1.49 lakhs to 26.08 lakhs)
#after driving how much distance the people will tend to sell the car
df_cars24['Distance Driven(KM)'].mean()
44831.0545212766
After driving an average distance of 45000 km people tend to sell the car
#most selling brand
df_cars24['Brand'].value_counts().plot(kind='bar')
plt.xlabel('Brand')
plt.ylabel('Highest selling')
plt.show()
len(df_cars24[(df_cars24.Brand=='Maruti')|(df_cars24.Brand=='Hyundai')|(df_cars24.Brand=='Honda')])/len(df_cars24)
0.39814200398142
Among all the nine brands the market share of honda Maruti and Hyundai is 40%
#Price distribution
sns.histplot(df_cars24['Price in Lakhs'],kde=True)
<Axes: xlabel='Price in Lakhs', ylabel='Count'>
df_cars24['Price in Lakhs'].mean(), df_cars24['Price in Lakhs'].median()
(7.115454545454545, 6.28)
1.the average price is about 7 lakhs and median price of used cars is 6 lakhs
2.the price distribution is more like longtail-distribution and there is right skewness we can observe in this distribution which is normal in price distribution which means outliers lie at the higher end prices
df_cars24["Fuel Type"].value_counts().plot(kind="pie")
<Axes: ylabel='count'>
#which type of car petrol or diesel or CNG is most buyed
fig = px.bar(df_cars24['Fuel Type'].value_counts())
fig.show()
print(df_cars24['Fuel Type'].value_counts())
print(f'percentage of petrol cars = {1174/len(df_cars24)}')
print(f'percentage of diesel cars = {289/len(df_cars24)}')
print(f'percentage of CNG cars = {44/len(df_cars24)}')
Fuel Type Petrol 1174 Diesel 289 CNG 44 Name: count, dtype: int64 percentage of petrol cars = 0.7790311877903119 percentage of diesel cars = 0.19177173191771732 percentage of CNG cars = 0.029197080291970802
Fuel type of majority of cars is Petrol which is 77% the least bought cars are of CNG which stands at 3%
#which type of car Automatic or Manual is most buyed
px.bar(df_cars24["Transmission"].value_counts())
print(df_cars24['Transmission'].value_counts())
print(f'percentage of Automatic cars = {391/len(df_cars24)}')
print(f'percentage of Manual cars = {1069/len(df_cars24)}')
Transmission Manual 1104 Automatic 403 Name: count, dtype: int64 percentage of Automatic cars = 0.2594558725945587 percentage of Manual cars = 0.7093563370935634
cars with manual transmission are higher than automatic transmission ie 70% of cars are manual transmssion
Bivariate analysis :#¶
#Vehicle listings in each city
df_cars24.groupby(['City'])['Year'].count().sort_values(ascending=False).plot(kind='bar', figsize=(10,6))
plt.ylabel('Number of listings per city', fontsize=12)
plt.xlabel('city', fontsize=12)
plt.title('Listings per city', fontsize=18)
plt.show()
len(df_cars24[(df_cars24.City == 'Bangalore')|(df_cars24.City == 'Delhi')|(df_cars24.City == 'Gurgaon')] )/len(df_cars24)
0.35567352355673526
Delhi, Bangalore, Grugaon are the 3 cities with the largest number of used cars listing, and these these three cities account for 35% of used cars among the other top cities present
len(df_cars24[(df_cars24.City == 'Ahmedabad') ])/len(df_cars24)
0.07631055076310551
the listings at ahmedabad are least which is 7% of the total
#Average car price of each Brand
df_out = df_cars24.groupby('Brand')['Price in Lakhs'].mean().reset_index(name='Average Price').sort_values(by=['Average Price'],ascending = False)
print(df_out)
sns.barplot(x=df_out['Brand'],y=df_out['Average Price'],errorbar=None)
Brand Average Price 2 KIA 12.603833 3 Mahindra 9.759167 6 Tata 7.999585 7 Toyota 7.618750 1 Hyundai 6.341550 0 Honda 6.263650 8 Volkswagen 6.017597 4 Maruti 5.191650 5 Renault 4.891818
<Axes: xlabel='Brand', ylabel='Average Price'>
Among all the brands Kia cars have the highest average price of about 12 lakhs followed by mahindra and tata with 9 lakhs and 8 lakhs respectively
# Which Barand cars are sold earlier and later (less driven car brands and most driven car brands)
df_out1 = df_cars24.groupby('Brand')['Distance Driven(KM)'].mean().reset_index(name='Distance Driven').sort_values(by=['Distance Driven'],ascending = False)
print(df_out1)
sns.barplot(y=df_out1['Brand'],x=df_out1['Distance Driven'],errorbar=None)
Brand Distance Driven 8 Volkswagen 54403.727273 7 Toyota 51950.733333 0 Honda 50989.245000 1 Hyundai 47765.605273 4 Maruti 46326.085273 3 Mahindra 43991.423611 2 KIA 36390.733333 5 Renault 36222.892045 6 Tata 35518.487329
<Axes: xlabel='Distance Driven', ylabel='Brand'>
Tata cars are sold very earlier only after driving about 35500 km only where as volkswagen cars are sold after driving 55000 km
#average car price city wise
df_out2 = df_cars24.groupby('City')['Price in Lakhs'].mean().reset_index(name='Average Price').sort_values(by=['Average Price'],ascending = False)
df_out2 = df_out2[ (df_out2['City'] != 'Chennai') & (df_out2['City'] != 'Kolkatta')&(df_out2['City'] != 'Mumbai')]
print(df_out2)
sns.barplot(y=df_out2['City'],x=df_out2['Average Price'], errorbar=None)
City Average Price 1 Bangalore 8.469944 0 Ahmedabad 7.755565 5 Hyderabad 7.348686 4 Gurgaon 7.321348 9 Pune 7.057833 8 Noida 6.943652 3 Delhi 6.929213
<Axes: xlabel='Average Price', ylabel='City'>
Average price of used cars is way costlier in Bangalore which is 8.5 lakhs and cheaper in delhi which is around 6 lakhs
#Comparing year and price
sns.lineplot(x=df_cars24['Year'],y=df_cars24["Price in Lakhs"],errorbar=None)
<Axes: xlabel='Year', ylabel='Price in Lakhs'>
Its is very obvious that recently bought cars are sold at higher price when compared to older models
#how will fuel type impact resale price
sns.boxplot(x = df_cars24['Fuel Type'],y=df_cars24["Price in Lakhs"])
<Axes: xlabel='Fuel Type', ylabel='Price in Lakhs'>
from the above graph it is very clear that among other fuel types diesel cars prices are higher thats why most people want to buy petrol cars
multivariate analysis#¶
# in each year which type of transmission cars are highest
sns.barplot(x=df_cars24['Year'],y=df_cars24['Price in Lakhs'],errorbar=None,hue=df_cars24['Transmission'])
<Axes: xlabel='Year', ylabel='Price in Lakhs'>
In almist every year from 2010 to 2023 automatic cars prices higher than manual cars
df_cars24.head()
| Brand | Model | Year | Fuel Type | Distance Driven(KM) | Transmission | Price in Lakhs | EMI/month | City | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Tata | punch | 2021 | Petrol | 28773.0 | Automatic | 7.08 | 13842.0 | Hyderabad |
| 1 | Tata | tiago | 2017 | Petrol | 72629.0 | Manual | 4.32 | 8446.0 | Hyderabad |
| 2 | Tata | tigor | 2021 | Petrol | 47307.0 | Manual | 5.86 | 11456.0 | Hyderabad |
| 3 | Tata | altroz | 2021 | Petrol | 42996.0 | Manual | 7.53 | 14721.0 | Hyderabad |
| 4 | Tata | tiago | 2017 | Petrol | 62204.0 | Automatic | 4.74 | 9267.0 | Hyderabad |
df
| Brand | Model | Year | Fuel Type | Distance Driven(KM) | Transmission | Price in Lakhs | EMI/month | City | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Tata | punch | 2021 | Petrol | 28773.0 | Automatic | 7.08 | 13842.0 | Hyderabad |
| 1 | Tata | tiago | 2017 | Petrol | 72629.0 | Manual | 4.32 | 8446.0 | Hyderabad |
| 2 | Tata | tigor | 2021 | Petrol | 47307.0 | Manual | 5.86 | 11456.0 | Hyderabad |
| 3 | Tata | altroz | 2021 | Petrol | 42996.0 | Manual | 7.53 | 14721.0 | Hyderabad |
| 4 | Tata | tiago | 2017 | Petrol | 62204.0 | Automatic | 4.74 | 9267.0 | Hyderabad |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1502 | Volkswagen | vento | 2010 | Petrol | 74715.0 | Manual | 2.00 | 17770.0 | Noida |
| 1503 | Volkswagen | ameo | 2017 | Petrol | 89489.0 | Manual | 3.52 | 6882.0 | Kolkatta |
| 1504 | Volkswagen | vento | 2015 | Petrol | 58015.0 | Automatic | 4.08 | 7976.0 | Kolkatta |
| 1505 | Volkswagen | ameo | 2017 | Diesel | 46293.0 | Automatic | 4.41 | 8622.0 | Kolkatta |
| 1506 | Volkswagen | ameo | 2019 | Petrol | 59396.0 | Manual | 4.24 | 8289.0 | Kolkatta |
1507 rows × 9 columns
pivot = pd.pivot_table(df,index=['City', 'Brand'],
values=['Distance Driven(KM)'], aggfunc='mean')
print(pivot)
Distance Driven(KM)
City Brand
Ahmedabad Honda 49775.800000
Hyundai 39033.050000
KIA 34491.666667
Mahindra 33563.666667
Maruti 51440.250000
... ...
Pune Maruti 53266.050000
Renault 24012.000000
Tata 36167.400000
Toyota 23325.500000
Volkswagen 58435.615385
[90 rows x 1 columns]
df2 = df[['Year', 'Distance Driven(KM)' , 'Price in Lakhs', 'EMI/month']].copy()
df2
| Year | Distance Driven(KM) | Price in Lakhs | EMI/month | |
|---|---|---|---|---|
| 0 | 2021 | 28773.0 | 7.08 | 13842.0 |
| 1 | 2017 | 72629.0 | 4.32 | 8446.0 |
| 2 | 2021 | 47307.0 | 5.86 | 11456.0 |
| 3 | 2021 | 42996.0 | 7.53 | 14721.0 |
| 4 | 2017 | 62204.0 | 4.74 | 9267.0 |
| ... | ... | ... | ... | ... |
| 1502 | 2010 | 74715.0 | 2.00 | 17770.0 |
| 1503 | 2017 | 89489.0 | 3.52 | 6882.0 |
| 1504 | 2015 | 58015.0 | 4.08 | 7976.0 |
| 1505 | 2017 | 46293.0 | 4.41 | 8622.0 |
| 1506 | 2019 | 59396.0 | 4.24 | 8289.0 |
1507 rows × 4 columns
sns.heatmap(df2.corr(),annot=True)
<Axes: >